The goal of this script is to generate a Seurat object for sample F31B.

  • removal of cells based on quality control metrics
  • normalization with LogNormalize, then doublets detection using scran hybrid and scDblFinder method, and doublet cells removal
  • normalization with LogNormalize, for only the remaining cells
  • cell cycle and cell type annotation
  • dimensionality reduction using PCA
  • projection using tSNE and UMAP
library(dplyr)
library(patchwork)
library(ggplot2)

.libPaths()
## [1] "/usr/local/lib/R/library"

Preparation

In this section, we set the global settings of the analysis. We will store data there :

out_dir = "."

We load the parameters :

sample_name = params$sample_name # "F31W"
# sample_name = "F31W"

Input count matrix is there :

count_matrix_dir = paste0(out_dir, "/input/", sample_name, "/raw_feature_bc_matrix")

We load the markers and specific colors for each cell type :

cell_markers = readRDS(paste0(out_dir, "/../1_metadata/wu_cell_markers.rds"))
cell_markers = lapply(cell_markers, FUN = toupper)
lengths(cell_markers)
##          CD4 T cells          CD8 T cells     Langerhans cells 
##                   13                   13                    9 
##          macrophages              B cells              cuticle 
##                   10                   16                   15 
##               cortex              medulla                  IRS 
##                   16                   10                   16 
##        proliferative               HF-SCs            IFE basal 
##                   20                   17                   16 
## IFE granular spinous                  ORS          melanocytes 
##                   17                   15                   10 
##            sebocytes 
##                    8

Here are custom colors for each cell type :

color_markers = readRDS(paste0(out_dir, "/../1_metadata/wu_color_markers.rds"))

data.frame(cell_type = names(color_markers),
           color = unlist(color_markers)) %>%
  ggplot2::ggplot(., aes(x = cell_type, y = 0, fill = cell_type)) +
  ggplot2::geom_point(pch = 21, size = 5) +
  ggplot2::scale_fill_manual(values = unlist(color_markers), breaks = names(color_markers)) +
  ggplot2::theme_classic() +
  ggplot2::theme(legend.position = "none",
                 axis.line = element_blank(),
                 axis.title = element_blank(),
                 axis.ticks = element_blank(),
                 axis.text.y = element_blank())

We load markers to display on the dotplot :

dotplot_markers = readRDS(paste0(out_dir, "/../1_metadata/wu_dotplot_markers.rds"))
dotplot_markers = lapply(dotplot_markers, FUN = toupper)
dotplot_markers
## $`CD4 T cells`
## [1] "PTPRC" "CD3E"  "CD4"  
## 
## $`CD8 T cells`
## [1] "CD3E" "CD8A"
## 
## $`Langerhans cells`
## [1] "CD207" "CPVL" 
## 
## $macrophages
## [1] "TREM2" "MSR1" 
## 
## $`B cells`
## [1] "CD79A" "CD79B"
## 
## $cuticle
## [1] "MSX2"  "KRT32" "KRT35"
## 
## $cortex
## [1] "KRT31" "PRR9" 
## 
## $medulla
## [1] "BAMBI"   "ADLH1A3"
## 
## $IRS
## [1] "KRT71" "KRT73"
## 
## $proliferative
## [1] "TOP2A" "MCM5"  "TK1"  
## 
## $`HF-SCs`
## [1] "KRT14"  "CXCL14"
## 
## $`IFE basal`
## [1] "COL17A1" "KRT15"  
## 
## $`IFE granular spinous`
## [1] "SPINK5" "KRT1"  
## 
## $ORS
## [1] "KRT16" "KRT6C"
## 
## $melanocytes
## [1] "DCT"   "MLANA"
## 
## $sebocytes
## [1] "CLMP"  "PPARG"

We load metadata for this sample :

sample_info = readRDS(paste0(out_dir, "/../1_metadata/wu_sample_info.rds"))
sample_info %>%
  dplyr::filter(project_name == sample_name)
##   project_name sample_type sample_identifier platform gender   location
## 1         F31B          HD           Wu_HD_2      10X      F hair scalp
##   laboratory     color
## 1         Wu slateblue

These is a parameter for different functions :

cl = aquarius::create_parallel_instance(nthreads = 3L)
cut_log_nCount_RNA = 6
cut_nFeature_RNA = 500
cut_percent.mt = 20
cut_percent.rb = 50

Load count matrix

In this section, we load the raw count matrix. Then, we applied an empty droplets filtering.

sobj = aquarius::load_sc_data(data_path = count_matrix_dir,
                              sample_name = sample_name,
                              my_seed = 1337L)
## [1]   27955 6794880
## [1] 123364408
## [1] 27955 77493
## [1] 121529265
## [1] 0.9851242
sobj
## An object of class Seurat 
## 27955 features across 77493 samples within 1 assay 
## Active assay: RNA (27955 features, 0 variable features)

(Time to run : 172.69 s)

We set an upper bound to 20,000 cells. Remaining cells correspond to ambient RNA.

if (ncol(sobj) > 20000) {
  sobj$nb_UMI = Seurat::GetAssayData(sobj, assay = "RNA", slot = "counts") %>%
    Matrix::colSums()
  
  thresh_20k = sort(sobj$nb_UMI, decreasing = TRUE)[20000]
  
  sobj = subset(sobj, nb_UMI >= thresh_20k)
  sobj$nb_UMI = NULL
  
  sobj
}
## An object of class Seurat 
## 27955 features across 20135 samples within 1 assay 
## Active assay: RNA (27955 features, 0 variable features)

In genes metadata, we add the Ensembl ID. The sobj@assays$RNA@meta.features dataframe contains three information :

  • rownames : gene names stored as the dimnames of the count matrix. Duplicated gene names will have a .1 at the end of their name
  • Ensembl_ID : EnsemblID, as stored in the features.tsv.gz file
  • gene_name : gene_name, as stored in the features.tsv.gz file. Duplicated gene names will have the same name.
features_df = read.csv(paste0(count_matrix_dir, "/features.tsv.gz"), sep = "\t", header = 0)
features_df = features_df[, c(1:2)]
colnames(features_df) = c("Ensembl_ID", "gene_name")
rownames(features_df) = rownames(sobj) # mandatory for Seurat::FindVariableFeatures

sobj@assays$RNA@meta.features = features_df
rm(features_df)

head(sobj@assays$RNA@meta.features)
##                  Ensembl_ID   gene_name
## MIR1302-2HG ENSG00000243485 MIR1302-2HG
## FAM138A     ENSG00000237613     FAM138A
## OR4F5       ENSG00000186092       OR4F5
## AL627309.1  ENSG00000238009  AL627309.1
## AL627309.3  ENSG00000239945  AL627309.3
## AL627309.4  ENSG00000241599  AL627309.4

We add the same columns as in metadata :

row_oi = (sample_info$project_name == sample_name)

sobj$project_name = sample_name
sobj$sample_identifier = sample_info[row_oi, "sample_identifier"]
sobj$sample_type = sample_info[row_oi, "sample_type"]
sobj$location = sample_info[row_oi, "location"]
sobj$laboratory = sample_info[row_oi, "laboratory"]

colnames(sobj@meta.data)
## [1] "orig.ident"        "nCount_RNA"        "nFeature_RNA"     
## [4] "log_nCount_RNA"    "project_name"      "sample_identifier"
## [7] "sample_type"       "location"          "laboratory"

Before filtering

Normalization

sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
## An object of class Seurat 
## 27955 features across 20135 samples within 1 assay 
## Active assay: RNA (27955 features, 3000 variable features)

Projection

We generate a tSNE to visualize cells before filtering.

sobj = aquarius::dimensions_reduction(sobj = sobj,
                                      assay = "RNA",
                                      reduction = "pca",
                                      max_dims = 100,
                                      verbose = FALSE)
Seurat::ElbowPlot(sobj, ndims = 100, reduction = "RNA_pca")

We generate a tSNE with 20 principal components :

ndims = 20
sobj = Seurat::RunTSNE(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_tsne"))

sobj
## An object of class Seurat 
## 27955 features across 20135 samples within 1 assay 
## Active assay: RNA (27955 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

Cell type

We annotate cells for cell type using Seurat::AddModuleScore function.

sobj = aquarius::cell_annot_custom(sobj,
                                   newname = "cell_type",
                                   markers = cell_markers,
                                   use_negative = TRUE,
                                   add_score = TRUE,
                                   verbose = TRUE)

colnames(sobj@meta.data) = stringr::str_replace_all(string = colnames(sobj@meta.data),
                                                    pattern = " ",
                                                    replacement = "_")

sobj$cell_type = factor(sobj$cell_type, levels = names(cell_markers))

table(sobj$cell_type)
## 
##          CD4 T cells          CD8 T cells     Langerhans cells 
##                   11                    4                   34 
##          macrophages              B cells              cuticle 
##                   17                  126                 2459 
##               cortex              medulla                  IRS 
##                 6783                  901                 1581 
##        proliferative               HF-SCs            IFE basal 
##                  435                  205                  224 
## IFE granular spinous                  ORS          melanocytes 
##                 1265                 5306                  347 
##            sebocytes 
##                  437

(Time to run : 61.88 s)

To justify cell type annotation, we can make a dotplot :

markers = c("PTPRC", "MSX2", "KRT16",
            unique(unlist(dotplot_markers[levels(sobj$cell_type)])))
markers = markers[markers %in% rownames(sobj)]

aquarius::plot_dotplot(sobj, assay = "RNA",
                       column_name = "cell_type",
                       markers = markers,
                       nb_hline = 0) +
  ggplot2::scale_color_gradientn(colors = aquarius:::color_gene) +
  ggplot2::theme(legend.position = "right",
                 legend.box = "vertical",
                 legend.direction = "vertical",
                 axis.title = element_blank(),
                 axis.text = element_text(size = 15))

We can make a barplot to see the composition of each dataset, and visualize cell types on the projection.

df_proportion = as.data.frame(prop.table(table(sobj$orig.ident,
                                               sobj$cell_type)))
colnames(df_proportion) = c("orig.ident", "cell_type", "freq")

quantif = table(sobj$orig.ident) %>%
  as.data.frame.table() %>%
  `colnames<-`(c("orig.ident", "nb_cells"))

# Plot
plot_list = list()

plot_list[[2]] = aquarius::plot_barplot(df = df_proportion,
                                        x = "orig.ident",
                                        y = "freq",
                                        fill = "cell_type",
                                        position = ggplot2::position_fill()) +
  ggplot2::scale_fill_manual(name = "Cell type",
                             values = color_markers[levels(df_proportion$cell_type)],
                             breaks = levels(df_proportion$cell_type)) +
  ggplot2::geom_label(data = quantif, inherit.aes = FALSE,
                      aes(x = orig.ident, y = 1.05, label = nb_cells),
                      label.size = 0)

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cell_type") +
  ggplot2::scale_color_manual(values = unlist(color_markers),
                              breaks = names(color_markers)) +
  ggplot2::labs(title = sample_name,
                subtitle = paste0(ncol(sobj), " cells")) +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1, widths = c(6, 1))

Cell cycle phase

We annotate cells for cell cycle phase using Seurat and cyclone.

cc_columns = aquarius::add_cell_cycle(sobj = sobj,
                                      assay = "RNA",
                                      species_rdx = "hs",
                                      BPPARAM = cl)@meta.data[, c("Seurat.Phase", "Phase")]
## 
##    G1   G2M     S 
## 14231  2336  3568
sobj$Seurat.Phase = cc_columns$Seurat.Phase
sobj$cyclone.Phase = cc_columns$Phase

table(sobj$Seurat.Phase, sobj$cyclone.Phase)
##      
##         G1  G2M    S
##   G1  6038  827 1800
##   G2M 4638 1141  990
##   S   3555  368  778

(Time to run : 1190.69 s)

We visualize cell cycle on the projection :

plot_list = list()

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "Seurat.Phase") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "Seurat.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cyclone.Phase") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "cyclone.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1)

Quality control

In this section, we look at the number of genes expressed by each cell, the number of UMI, the percentage of mitochondrial genes expressed, and the percentage of ribosomal genes expressed. Then, without taking into account the cells expressing low number of genes or have low number of UMI, we identify doublet cells.

We compute four quality metrics :

sobj = Seurat::PercentageFeatureSet(sobj, pattern = "^MT", col.name = "percent.mt")
sobj = Seurat::PercentageFeatureSet(sobj, pattern = "^RP[L|S][0-9]*$", col.name = "percent.rb")

head(sobj@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA log_nCount_RNA
## AAACCCAAGAAGGGAT-1       F31B      36397         5126      10.502242
## AAACCCAAGCCAAGGT-1       F31B        636          342       6.455199
## AAACCCAAGCTCCGAC-1       F31B        555          338       6.318968
## AAACCCAAGCTGCGAA-1       F31B        544          350       6.298949
## AAACCCAAGCTTTCCC-1       F31B        566          353       6.338594
## AAACCCAAGGTCGCCT-1       F31B        792          484       6.674561
##                    project_name sample_identifier sample_type   location
## AAACCCAAGAAGGGAT-1         F31B           Wu_HD_2          HD hair scalp
## AAACCCAAGCCAAGGT-1         F31B           Wu_HD_2          HD hair scalp
## AAACCCAAGCTCCGAC-1         F31B           Wu_HD_2          HD hair scalp
## AAACCCAAGCTGCGAA-1         F31B           Wu_HD_2          HD hair scalp
## AAACCCAAGCTTTCCC-1         F31B           Wu_HD_2          HD hair scalp
## AAACCCAAGGTCGCCT-1         F31B           Wu_HD_2          HD hair scalp
##                    laboratory score_CD4_T_cells score_CD8_T_cells
## AAACCCAAGAAGGGAT-1         Wu      -0.011988469      -0.003625960
## AAACCCAAGCCAAGGT-1         Wu       0.000000000       0.000000000
## AAACCCAAGCTCCGAC-1         Wu      -0.004824548      -0.002315556
## AAACCCAAGCTGCGAA-1         Wu      -0.002427816      -0.002330474
## AAACCCAAGCTTTCCC-1         Wu      -0.002397054       0.000000000
## AAACCCAAGGTCGCCT-1         Wu      -0.002139229       0.000000000
##                    score_Langerhans_cells score_macrophages score_B_cells
## AAACCCAAGAAGGGAT-1           -0.018971118      -0.014439002  -0.004262379
## AAACCCAAGCCAAGGT-1            0.000000000      -0.003045190   0.000000000
## AAACCCAAGCTCCGAC-1            0.000000000      -0.009552606  -0.001900250
## AAACCCAAGCTGCGAA-1           -0.003859848       0.000000000  -0.001912492
## AAACCCAAGCTTTCCC-1           -0.003810941       0.000000000   0.000000000
## AAACCCAAGGTCGCCT-1           -0.010203121      -0.011295131   0.000000000
##                    score_cuticle score_cortex score_medulla  score_IRS
## AAACCCAAGAAGGGAT-1     0.4271370   0.06904114    -0.1596649 -0.6366430
## AAACCCAAGCCAAGGT-1     0.1409309   0.57187683     0.5386603  0.5563725
## AAACCCAAGCTCCGAC-1     0.2117394   0.64974727     0.1171012  0.2163121
## AAACCCAAGCTGCGAA-1     0.2718378   0.46986875     0.1362155  0.1892949
## AAACCCAAGCTTTCCC-1     0.5676369   0.89458440     0.1314865  0.3666754
## AAACCCAAGGTCGCCT-1     0.2212768   0.13689049     0.1760009  0.6300422
##                    score_proliferative score_HF-SCs score_IFE_basal
## AAACCCAAGAAGGGAT-1         -0.07698009 -0.053820275     -0.02855556
## AAACCCAAGCCAAGGT-1          0.14709367  0.024924172     -0.07742401
## AAACCCAAGCTCCGAC-1         -0.01563200 -0.156008362     -0.10947570
## AAACCCAAGCTGCGAA-1         -0.02720522  0.178459426     -0.11220827
## AAACCCAAGCTTTCCC-1         -0.01699434 -0.160285759     -0.11238948
## AAACCCAAGGTCGCCT-1          0.13342507  0.004949276     -0.14229040
##                    score_IFE_granular_spinous   score_ORS score_melanocytes
## AAACCCAAGAAGGGAT-1                 -0.4743133  0.30451667        -0.3059911
## AAACCCAAGCCAAGGT-1                 -0.2928820 -0.23960574         0.2237727
## AAACCCAAGCTCCGAC-1                  0.2010028 -0.08061869        -0.1356226
## AAACCCAAGCTGCGAA-1                 -0.1149472  0.42334484        -0.1531818
## AAACCCAAGCTTTCCC-1                  0.0158392 -0.04544535        -0.1387164
## AAACCCAAGGTCGCCT-1                 -0.2225996  0.42245975        -0.1747036
##                    score_sebocytes cell_type Seurat.Phase cyclone.Phase
## AAACCCAAGAAGGGAT-1     -0.02445698   cuticle           G1            G1
## AAACCCAAGCCAAGGT-1     -0.05470910   medulla          G2M            G1
## AAACCCAAGCTCCGAC-1     -0.11491697    cortex           G1            G1
## AAACCCAAGCTGCGAA-1     -0.09239270       ORS          G2M            G1
## AAACCCAAGCTTTCCC-1     -0.06561361   cuticle          G2M            G1
## AAACCCAAGGTCGCCT-1     -0.10159740       ORS           G1             S
##                    percent.mt percent.rb
## AAACCCAAGAAGGGAT-1  10.308542   21.72707
## AAACCCAAGCCAAGGT-1  15.723270   24.21384
## AAACCCAAGCTCCGAC-1   7.567568   22.16216
## AAACCCAAGCTGCGAA-1   2.022059   24.08088
## AAACCCAAGCTTTCCC-1   5.830389   22.79152
## AAACCCAAGGTCGCCT-1   2.651515   19.57071

We get the cell barcodes for the failing cells :

fail_percent.mt = sobj@meta.data %>% dplyr::filter(percent.mt > cut_percent.mt) %>% rownames()
fail_percent.rb = sobj@meta.data %>% dplyr::filter(percent.rb > cut_percent.rb) %>% rownames()
fail_log_nCount_RNA = sobj@meta.data %>% dplyr::filter(log_nCount_RNA < cut_log_nCount_RNA) %>% rownames()
fail_nFeature_RNA = sobj@meta.data %>% dplyr::filter(nFeature_RNA < cut_nFeature_RNA) %>% rownames()

Doublet cells detection

Without taking into account the low UMI and low number of features cells, we identify doublets.

fsobj = subset(sobj, invert = TRUE,
               cells = unique(c(fail_log_nCount_RNA, fail_nFeature_RNA)))
fsobj
## An object of class Seurat 
## 27955 features across 8298 samples within 1 assay 
## Active assay: RNA (27955 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

On this filtered dataset, we apply doublet cells detection. Just before, we run the normalization, taking into account only the remaining cells.

fsobj = Seurat::NormalizeData(fsobj,
                              normalization.method = "LogNormalize",
                              assay = "RNA")

fsobj = Seurat::FindVariableFeatures(fsobj,
                                     assay = "RNA",
                                     nfeatures = 3000)
fsobj
## An object of class Seurat 
## 27955 features across 8298 samples within 1 assay 
## Active assay: RNA (27955 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

We identify doublet cells :

fsobj = aquarius::find_doublets(sobj = fsobj,
                                BPPARAM = cl)
## [1] 27955  8298
## 
## FALSE  TRUE 
##  7081  1217 
## [11:03:41] WARNING: amalgamation/../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## 
## FALSE  TRUE 
##  7572   726 
## 
## FALSE  TRUE 
##  6861  1437
fail_doublets_consensus = Seurat::WhichCells(fsobj, expression = doublets_consensus.class)
fail_doublets_scDblFinder = Seurat::WhichCells(fsobj, expression = scDblFinder.class)
fail_doublets_hybrid = Seurat::WhichCells(fsobj, expression = hybrid_score.class)

(Time to run : 215.38 s)

We add the information in the non filtered Seurat object :

sobj$doublets_consensus.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                                 colnames(sobj) %in% fail_doublets_consensus ~ TRUE,
                                                 !(colnames(sobj) %in% fail_doublets_consensus) ~ FALSE)

sobj$scDblFinder.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                          colnames(sobj) %in% fail_doublets_scDblFinder ~ TRUE,
                                          !(colnames(sobj) %in% fail_doublets_scDblFinder) ~ FALSE)

sobj$hybrid_score.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                           colnames(sobj) %in% fail_doublets_hybrid ~ TRUE,
                                           !(colnames(sobj) %in% fail_doublets_hybrid) ~ FALSE)

Quality control representation

We can visualize the 4 cells quality with a Venn diagram :

n_filtered = c(fail_percent.mt, fail_percent.rb, fail_log_nCount_RNA, fail_nFeature_RNA) %>%
  unique() %>% length()
percent_filtered = round(100*(n_filtered/ncol(sobj)), 2)

ggvenn::ggvenn(list(percent.mt = fail_percent.mt,
                    percent.rb = fail_percent.rb,
                    log_nCount_RNA = fail_log_nCount_RNA,
                    nFeature_RNA = fail_nFeature_RNA), 
               fill_color = c("#0073C2FF", "#EFC000FF", "orange", "pink"),
               stroke_size = 0.5, set_name_size = 4) +
  ggplot2::labs(title = "Filtered out cells",
                subtitle = paste0(n_filtered, " cells (", percent_filtered, " % of all cells)")) +
  ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                 plot.subtitle = element_text(hjust = 0.5))

Number of UMI

To visualize the threshold for number of UMI, we can make a histogram :

aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "log_nCount_RNA",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_log_nCount_RNA)

Seurat::VlnPlot(sobj, features = "log_nCount_RNA", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_log_nCount_RNA, col = "red") +
  ggplot2::labs(x = "")

sobj$fail = ifelse(colnames(sobj) %in% fail_log_nCount_RNA,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "log_nCount_RNA",
                subtitle = paste0(length(fail_log_nCount_RNA), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

Number of features

To visualize the threshold for number of features, we can make a histogram :

aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "nFeature_RNA",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_nFeature_RNA)

Seurat::VlnPlot(sobj, features = "nFeature_RNA", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_nFeature_RNA, col = "red") +
  ggplot2::labs(x = "")

sobj$fail = ifelse(colnames(sobj) %in% fail_nFeature_RNA,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "nFeature_RNA",
                subtitle = paste0(length(fail_nFeature_RNA), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

Mitochondrial genes expression

To identify a threshold for mitochondrial gene expression, we can make a histogram :

aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "percent.mt",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_percent.mt)

Seurat::VlnPlot(sobj, features = "percent.mt", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_percent.mt, col = "red") +
  ggplot2::labs(x = "")

sobj$fail = ifelse(colnames(sobj) %in% fail_percent.mt,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "percent.mt",
                subtitle = paste0(length(fail_percent.mt), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

Ribosomal genes expression

To identify a threshold for ribosomal gene expression, we can make a histogram :

aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "percent.rb",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_percent.rb)

Seurat::VlnPlot(sobj, features = "percent.rb", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_percent.rb, col = "red") +
  ggplot2::labs(x = "")

sobj$fail = ifelse(colnames(sobj) %in% fail_percent.rb,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "percent.rb",
                subtitle = paste0(length(fail_percent.rb), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

FACS-like figure

We would like to see if the number of feature expressed by cell, and the number of UMI is correlated with the cell type, the percentage of mitochondrial and ribosomal gene expressed, and the doublet status. We build the log_nCount_RNA by nFeature_RNA figure, where cells (dots) are colored by these different metrics.

This is the figure, colored by cell type :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "cell_type",
                           col_colors = unname(color_markers),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the percentage of mitochondrial genes expressed in cell :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "percent.mt",
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the percentage of ribosomal genes expressed in cell :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "percent.rb",
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the doublet cells status (doublets_consensus.class) :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "doublets_consensus.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the doublet cells status (scDblFinder.class) :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "scDblFinder.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

This is the figure, colored by the doublet cells status (hybrid_score.class) :

aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "hybrid_score.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)

Visualization as piechart

Do filtered cells belong to a particular cell type ?

sobj$all_cells = TRUE

plot_list = list()

## All cells
df = sobj@meta.data
if (nrow(df) == 0) {
  plot_list[[1]] = ggplot()
} else {
  plot_list[[1]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = "All cells",
                  subtitle = paste(nrow(df), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## Doublets consensus
df = sobj@meta.data %>%
  dplyr::filter(doublets_consensus.class)
if (nrow(df) == 0) {
  plot_list[[2]] = ggplot()
} else {
  plot_list[[2]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = "doublets_consensus.class",
                  subtitle = paste(sum(sobj$doublets_consensus.class, na.rm = TRUE), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## percent.mt
df = sobj@meta.data %>%
  dplyr::filter(percent.mt > cut_percent.mt)
if (nrow(df) == 0) {
  plot_list[[3]] = ggplot()
} else {
  plot_list[[3]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("percent.mt >", cut_percent.mt),
                  subtitle = paste(length(fail_percent.mt), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## percent.rb
df = sobj@meta.data %>%
  dplyr::filter(percent.rb > cut_percent.rb)
if (nrow(df) == 0) {
  plot_list[[4]] = ggplot()
} else {
  plot_list[[4]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("percent.rb >", cut_percent.rb),
                  subtitle = paste(length(fail_percent.rb), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## log_nCount_RNA
df = sobj@meta.data %>%
  dplyr::filter(log_nCount_RNA < cut_log_nCount_RNA)
if (nrow(df) == 0) {
  plot_list[[5]] = ggplot()
} else {
  plot_list[[5]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("log_nCount_RNA <", round(cut_log_nCount_RNA, 2)),
                  subtitle = paste(length(fail_log_nCount_RNA), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## nFeature_RNA
df = sobj@meta.data %>%
  dplyr::filter(nFeature_RNA < cut_nFeature_RNA)
if (nrow(df) == 0) {
  plot_list[[6]] = ggplot()
} else {
  plot_list[[6]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("nFeature_RNA <", round(cut_nFeature_RNA, 2)),
                  subtitle = paste(length(fail_nFeature_RNA), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

patchwork::wrap_plots(plot_list, ncol = 3) +
  patchwork::plot_layout(guides = "collect") &
  ggplot2::theme(legend.position = "right")

Doublet cells status

We can compare doublet detection methods with a Venn diagram :

ggvenn::ggvenn(list(hybrid = fail_doublets_hybrid,
                    scDblFinder = fail_doublets_scDblFinder), 
               fill_color = c("#0073C2FF", "#EFC000FF"),
               stroke_size = 0.5, set_name_size = 4) +
  ggplot2::ggtitle(label = "Doublet cells") +
  ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"))

We visualize cells annotation for doublets :

plot_list = list()

# scDblFinder.class
sobj$fail = ifelse(sobj$scDblFinder.class,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "fail",
                                 na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "scDblFinder.class",
                subtitle = paste0(sum(sobj$scDblFinder.class, na.rm = TRUE), " cells")) +
  Seurat::NoAxes() + Seurat::NoLegend() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

# hybrid_score.class
sobj$fail = ifelse(sobj$hybrid_score.class,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "fail",
                                 na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "hybrid_score.class",
                subtitle = paste0(sum(sobj$hybrid_score.class, na.rm = TRUE), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

sobj$fail = NULL

# Plot
patchwork::wrap_plots(plot_list, nrow = 1)

Save

We could save this object before filtering (remove eval = FALSE) :

saveRDS(sobj, paste0(out_dir, "/datasets/", sample_name, "_sobj_unfiltered.rds"))

Filtering

We remove :

  • cells with a number of UMI lower than 6
  • cells expressing a number of genes lower than 500
  • cells having more than 20 % of UMI related to mitochondrial genes
  • cells having more than 50 % of UMI related to ribosomal genes
  • doublet cells detected with both method (scDblFinder and scds-hybrid)
sobj = subset(sobj, invert = TRUE,
              cells = unique(c(fail_log_nCount_RNA, fail_nFeature_RNA,
                               fail_percent.mt, fail_percent.rb,
                               fail_doublets_consensus)))
sobj
## An object of class Seurat 
## 27955 features across 4786 samples within 1 assay 
## Active assay: RNA (27955 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

Post-filtering processing

Normalization

We normalize the count matrix for remaining cells :

sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
## An object of class Seurat 
## 27955 features across 4786 samples within 1 assay 
## Active assay: RNA (27955 features, 3000 variable features)
##  2 dimensional reductions calculated: RNA_pca, RNA_pca_20_tsne

Projection

We perform a PCA :

sobj = aquarius::dimensions_reduction(sobj = sobj,
                                      assay = "RNA",
                                      reduction = "pca",
                                      max_dims = 100,
                                      verbose = FALSE)
Seurat::ElbowPlot(sobj, ndims = 100, reduction = "RNA_pca")

We generate a tSNE and a UMAP with 20 principal components :

ndims = 20
sobj = Seurat::RunTSNE(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_tsne"))

sobj = Seurat::RunUMAP(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_umap"))

Annotation

We annotate cells for cell type, with the new normalized expression matrix :

score_columns = grep(x = colnames(sobj@meta.data), pattern = "^score", value = TRUE)
sobj@meta.data[, score_columns] = NULL
sobj$cell_type = NULL

sobj = aquarius::cell_annot_custom(sobj,
                                   newname = "cell_type",
                                   markers = cell_markers,
                                   use_negative = TRUE,
                                   add_score = TRUE,
                                   verbose = TRUE)

sobj$cell_type = factor(sobj$cell_type, levels = names(cell_markers))

colnames(sobj@meta.data) = stringr::str_replace_all(string = colnames(sobj@meta.data),
                                                    pattern = " ",
                                                    replacement = "_")

table(sobj$cell_type)
## 
##          CD4 T cells          CD8 T cells     Langerhans cells 
##                    5                    2                    8 
##          macrophages              B cells              cuticle 
##                   13                   21                  418 
##               cortex              medulla                  IRS 
##                  604                  196                  172 
##        proliferative               HF-SCs            IFE basal 
##                  201                   72                   76 
## IFE granular spinous                  ORS          melanocytes 
##                  597                 2200                  138 
##            sebocytes 
##                   63

(Time to run : 25.95 s)

To justify cell type annotation, we can make a dotplot :

markers = c("PTPRC", unique(unlist(dotplot_markers[levels(sobj$cell_type)])))
markers = markers[markers %in% rownames(sobj)]

aquarius::plot_dotplot(sobj, assay = "RNA",
                       column_name = "cell_type",
                       markers = markers,
                       nb_hline = 0) +
  ggplot2::scale_color_gradientn(colors = aquarius:::color_gene) +
  ggplot2::theme(legend.position = "right",
                 legend.box = "vertical",
                 legend.direction = "vertical",
                 axis.title = element_blank(),
                 axis.text = element_text(size = 15))

We can make a barplot to see the composition of each dataset, and visualize cell types on the projection.

df_proportion = as.data.frame(prop.table(table(sobj$orig.ident,
                                               sobj$cell_type)))
colnames(df_proportion) = c("orig.ident", "cell_type", "freq")

quantif = table(sobj$orig.ident) %>%
  as.data.frame.table() %>%
  `colnames<-`(c("orig.ident", "nb_cells"))

# Plot
plot_list = list()

plot_list[[2]] = aquarius::plot_barplot(df = df_proportion,
                                        x = "orig.ident",
                                        y = "freq",
                                        fill = "cell_type",
                                        position = ggplot2::position_fill()) +
  ggplot2::scale_fill_manual(name = "Cell type",
                             values = color_markers[levels(df_proportion$cell_type)],
                             breaks = levels(df_proportion$cell_type)) +
  ggplot2::geom_label(data = quantif, inherit.aes = FALSE,
                      aes(x = orig.ident, y = 1.05, label = nb_cells),
                      label.size = 0)

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cell_type",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::scale_color_manual(values = unlist(color_markers),
                              breaks = names(color_markers)) +
  ggplot2::labs(title = sample_name,
                subtitle = paste0(ncol(sobj), " cells")) +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1, widths = c(6, 1))

Cell cycle

We annotate cells for cell cycle phase :

cc_columns = aquarius::add_cell_cycle(sobj = sobj,
                                      assay = "RNA",
                                      species_rdx = "hs",
                                      BPPARAM = cl)@meta.data[, c("Seurat.Phase", "Phase")]
## 
##   G1  G2M    S 
## 4052  231  503
sobj$Seurat.Phase = cc_columns$Seurat.Phase
sobj$cyclone.Phase = cc_columns$Phase

table(sobj$Seurat.Phase, sobj$cyclone.Phase)
##      
##         G1  G2M    S
##   G1  2174   72  261
##   G2M  846  114  119
##   S   1032   45  123

(Time to run : 410 s)

We visualize cell cycle on the projection :

plot_list = list()

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "Seurat.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "cyclone.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1)

Clustering

We make a highly resolutive clustering :

sobj = Seurat::FindNeighbors(sobj, reduction = "RNA_pca", dims = c(1:ndims))
sobj = Seurat::FindClusters(sobj, resolution = 2)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 4786
## Number of edges: 165854
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7916
## Number of communities: 29
## Elapsed time: 0 seconds
table(sobj$seurat_clusters)
## 
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19 
## 540 539 376 331 326 243 220 211 194 169 168 161 155 127 127 122 117 110 103  96 
##  20  21  22  23  24  25  26  27  28 
##  63  55  52  43  42  34  32  16  14

Visualization

Cell type

We can visualize the cell type :

tsne = Seurat::DimPlot(sobj, group.by = "cell_type",
                       reduction = paste0("RNA_pca_", ndims, "_tsne"), cols = color_markers) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "cell_type",
                       reduction = paste0("RNA_pca_", ndims, "_umap"), cols = color_markers) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap

Cell cycle

We can visualize the cell cycle, from Seurat :

tsne = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap

We can visualize the cell cycle, from cyclone :

tsne = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap

Clusters

We visualize the clustering :

tsne = Seurat::DimPlot(sobj, group.by = "seurat_clusters", label = TRUE,
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "seurat_clusters", label = TRUE,
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap

Gene expression

We visualize all cell types markers on the tSNE :

markers = dotplot_markers %>% unlist() %>% unname()
markers = markers[markers %in% rownames(sobj)]

plot_list = lapply(markers,
                   FUN = function(one_gene) {
                     p = Seurat::FeaturePlot(sobj, features = one_gene,
                                             reduction = paste0("RNA_pca_", ndims, "_tsne")) +
                       ggplot2::labs(title = one_gene) +
                       ggplot2::scale_color_gradientn(colors = aquarius::color_gene) +
                       ggplot2::theme(aspect.ratio = 1,
                                      plot.subtitle = element_text(hjust = 0.5)) +
                       Seurat::NoAxes()
                     return(p)
                   })

patchwork::wrap_plots(plot_list, ncol = 4)

Save

We save the annotated and filtered Seurat object :

saveRDS(sobj, file = paste0(out_dir, "/datasets/", sample_name, "_sobj_filtered.rds"))

R session

show
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/local/lib/R/lib/libRblas.so
## LAPACK: /usr/local/lib/R/lib/libRlapack.so
## 
## locale:
## [1] C
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_3.3.5   patchwork_1.1.2 dplyr_1.0.7    
## 
## loaded via a namespace (and not attached):
##   [1] softImpute_1.4              graphlayouts_0.7.0         
##   [3] pbapply_1.4-2               lattice_0.20-41            
##   [5] haven_2.3.1                 vctrs_0.3.8                
##   [7] usethis_2.0.1               dynwrap_1.2.1              
##   [9] blob_1.2.1                  survival_3.2-13            
##  [11] prodlim_2019.11.13          dynutils_1.0.5             
##  [13] later_1.3.0                 DBI_1.1.1                  
##  [15] R.utils_2.11.0              SingleCellExperiment_1.8.0 
##  [17] rappdirs_0.3.3              uwot_0.1.8                 
##  [19] dqrng_0.2.1                 jpeg_0.1-8.1               
##  [21] zlibbioc_1.32.0             pspline_1.0-18             
##  [23] pcaMethods_1.78.0           mvtnorm_1.1-1              
##  [25] htmlwidgets_1.5.4           GlobalOptions_0.1.2        
##  [27] future_1.22.1               UpSetR_1.4.0               
##  [29] laeken_0.5.2                leiden_0.3.3               
##  [31] clustree_0.4.3              parallel_3.6.3             
##  [33] scater_1.14.6               irlba_2.3.3                
##  [35] DEoptimR_1.0-9              tidygraph_1.1.2            
##  [37] Rcpp_1.0.9                  readr_2.0.2                
##  [39] KernSmooth_2.23-17          carrier_0.1.0              
##  [41] promises_1.1.0              gdata_2.18.0               
##  [43] DelayedArray_0.12.3         limma_3.42.2               
##  [45] graph_1.64.0                RcppParallel_5.1.4         
##  [47] Hmisc_4.4-0                 fs_1.5.2                   
##  [49] RSpectra_0.16-0             fastmatch_1.1-0            
##  [51] ranger_0.12.1               digest_0.6.25              
##  [53] png_0.1-7                   sctransform_0.2.1          
##  [55] cowplot_1.0.0               DOSE_3.12.0                
##  [57] ggvenn_0.1.9                here_1.0.1                 
##  [59] TInGa_0.0.0.9000            ggraph_2.0.3               
##  [61] pkgconfig_2.0.3             GO.db_3.10.0               
##  [63] DelayedMatrixStats_1.8.0    gower_0.2.1                
##  [65] ggbeeswarm_0.6.0            iterators_1.0.12           
##  [67] DropletUtils_1.6.1          reticulate_1.26            
##  [69] clusterProfiler_3.14.3      SummarizedExperiment_1.16.1
##  [71] circlize_0.4.15             beeswarm_0.4.0             
##  [73] GetoptLong_1.0.5            xfun_0.35                  
##  [75] bslib_0.3.1                 zoo_1.8-10                 
##  [77] tidyselect_1.1.0            reshape2_1.4.4             
##  [79] purrr_0.3.4                 ica_1.0-2                  
##  [81] pcaPP_1.9-73                viridisLite_0.3.0          
##  [83] rtracklayer_1.46.0          rlang_1.0.2                
##  [85] hexbin_1.28.1               jquerylib_0.1.4            
##  [87] dyneval_0.9.9               glue_1.4.2                 
##  [89] RColorBrewer_1.1-2          matrixStats_0.56.0         
##  [91] stringr_1.4.0               lava_1.6.7                 
##  [93] europepmc_0.3               DESeq2_1.26.0              
##  [95] recipes_0.1.17              labeling_0.3               
##  [97] httpuv_1.5.2                class_7.3-17               
##  [99] BiocNeighbors_1.4.2         DO.db_2.9                  
## [101] annotate_1.64.0             jsonlite_1.7.2             
## [103] XVector_0.26.0              bit_4.0.4                  
## [105] mime_0.9                    aquarius_0.1.5             
## [107] Rsamtools_2.2.3             gridExtra_2.3              
## [109] gplots_3.0.3                stringi_1.4.6              
## [111] processx_3.5.2              gsl_2.1-6                  
## [113] bitops_1.0-6                cli_3.0.1                  
## [115] batchelor_1.2.4             RSQLite_2.2.0              
## [117] randomForest_4.6-14         tidyr_1.1.4                
## [119] data.table_1.14.2           rstudioapi_0.13            
## [121] org.Mm.eg.db_3.10.0         GenomicAlignments_1.22.1   
## [123] nlme_3.1-147                qvalue_2.18.0              
## [125] scran_1.14.6                locfit_1.5-9.4             
## [127] scDblFinder_1.1.8           listenv_0.8.0              
## [129] ggthemes_4.2.4              gridGraphics_0.5-0         
## [131] R.oo_1.24.0                 dbplyr_1.4.4               
## [133] BiocGenerics_0.32.0         TTR_0.24.2                 
## [135] readxl_1.3.1                lifecycle_1.0.1            
## [137] timeDate_3043.102           ggpattern_0.3.1            
## [139] munsell_0.5.0               cellranger_1.1.0           
## [141] R.methodsS3_1.8.1           proxyC_0.1.5               
## [143] visNetwork_2.0.9            caTools_1.18.0             
## [145] codetools_0.2-16            Biobase_2.46.0             
## [147] GenomeInfoDb_1.22.1         vipor_0.4.5                
## [149] lmtest_0.9-38               msigdbr_7.5.1              
## [151] htmlTable_1.13.3            triebeard_0.3.0            
## [153] lsei_1.2-0                  xtable_1.8-4               
## [155] ROCR_1.0-7                  BiocManager_1.30.10        
## [157] scatterplot3d_0.3-41        abind_1.4-5                
## [159] farver_2.0.3                parallelly_1.28.1          
## [161] RANN_2.6.1                  askpass_1.1                
## [163] GenomicRanges_1.38.0        RcppAnnoy_0.0.16           
## [165] tibble_3.1.5                ggdendro_0.1-20            
## [167] cluster_2.1.0               future.apply_1.5.0         
## [169] Seurat_3.1.5                dendextend_1.15.1          
## [171] Matrix_1.3-2                ellipsis_0.3.2             
## [173] prettyunits_1.1.1           lubridate_1.7.9            
## [175] ggridges_0.5.2              igraph_1.2.5               
## [177] RcppEigen_0.3.3.7.0         fgsea_1.12.0               
## [179] remotes_2.4.2               scBFA_1.0.0                
## [181] destiny_3.0.1               VIM_6.1.1                  
## [183] testthat_3.1.0              htmltools_0.5.2            
## [185] BiocFileCache_1.10.2        yaml_2.2.1                 
## [187] utf8_1.1.4                  plotly_4.9.2.1             
## [189] XML_3.99-0.3                ModelMetrics_1.2.2.2       
## [191] e1071_1.7-3                 foreign_0.8-76             
## [193] withr_2.5.0                 fitdistrplus_1.0-14        
## [195] BiocParallel_1.20.1         xgboost_1.4.1.1            
## [197] bit64_4.0.5                 foreach_1.5.0              
## [199] robustbase_0.93-9           Biostrings_2.54.0          
## [201] GOSemSim_2.13.1             rsvd_1.0.3                 
## [203] memoise_2.0.0               evaluate_0.18              
## [205] forcats_0.5.0               rio_0.5.16                 
## [207] geneplotter_1.64.0          tzdb_0.1.2                 
## [209] caret_6.0-86                ps_1.6.0                   
## [211] DiagrammeR_1.0.6.1          curl_4.3                   
## [213] fdrtool_1.2.15              fansi_0.4.1                
## [215] highr_0.8                   urltools_1.7.3             
## [217] xts_0.12.1                  GSEABase_1.48.0            
## [219] acepack_1.4.1               edgeR_3.28.1               
## [221] checkmate_2.0.0             scds_1.2.0                 
## [223] cachem_1.0.6                npsurv_0.4-0               
## [225] babelgene_22.3              rjson_0.2.20               
## [227] openxlsx_4.1.5              ggrepel_0.9.1              
## [229] clue_0.3-60                 rprojroot_2.0.2            
## [231] stabledist_0.7-1            tools_3.6.3                
## [233] sass_0.4.0                  nichenetr_1.1.1            
## [235] magrittr_2.0.1              RCurl_1.98-1.2             
## [237] proxy_0.4-24                car_3.0-11                 
## [239] ape_5.3                     ggplotify_0.0.5            
## [241] xml2_1.3.2                  httr_1.4.2                 
## [243] assertthat_0.2.1            rmarkdown_2.18             
## [245] boot_1.3-25                 globals_0.14.0             
## [247] R6_2.4.1                    Rhdf5lib_1.8.0             
## [249] nnet_7.3-14                 RcppHNSW_0.2.0             
## [251] progress_1.2.2              genefilter_1.68.0          
## [253] statmod_1.4.34              gtools_3.8.2               
## [255] shape_1.4.6                 HDF5Array_1.14.4           
## [257] BiocSingular_1.2.2          rhdf5_2.30.1               
## [259] splines_3.6.3               AUCell_1.8.0               
## [261] carData_3.0-4               colorspace_1.4-1           
## [263] generics_0.1.0              stats4_3.6.3               
## [265] base64enc_0.1-3             dynfeature_1.0.0           
## [267] smoother_1.1                gridtext_0.1.1             
## [269] pillar_1.6.3                tweenr_1.0.1               
## [271] sp_1.4-1                    ggplot.multistats_1.0.0    
## [273] rvcheck_0.1.8               GenomeInfoDbData_1.2.2     
## [275] plyr_1.8.6                  gtable_0.3.0               
## [277] zip_2.2.0                   knitr_1.41                 
## [279] ComplexHeatmap_2.14.0       latticeExtra_0.6-29        
## [281] biomaRt_2.42.1              IRanges_2.20.2             
## [283] fastmap_1.1.0               ADGofTest_0.3              
## [285] copula_1.0-0                doParallel_1.0.15          
## [287] AnnotationDbi_1.48.0        vcd_1.4-8                  
## [289] babelwhale_1.0.1            openssl_1.4.1              
## [291] scales_1.1.1                backports_1.2.1            
## [293] S4Vectors_0.24.4            ipred_0.9-12               
## [295] enrichplot_1.6.1            hms_1.1.1                  
## [297] ggforce_0.3.1               Rtsne_0.15                 
## [299] shiny_1.7.1                 numDeriv_2016.8-1.1        
## [301] polyclip_1.10-0             grid_3.6.3                 
## [303] lazyeval_0.2.2              Formula_1.2-3              
## [305] tsne_0.1-3                  crayon_1.3.4               
## [307] MASS_7.3-54                 pROC_1.16.2                
## [309] viridis_0.5.1               dynparam_1.0.0             
## [311] rpart_4.1-15                zinbwave_1.8.0             
## [313] compiler_3.6.3              ggtext_0.1.0
---
params:
  sample_name: "F31W"
title: "OEP002321 dataset"
subtitle: "Sample `r params$sample_name`"
author: "Audrey"
date: "`r format(Sys.time(), '%Y-%m-%d')`"
output:
  html_document:
    code_folding: show
    code_download: true
    toc: true
    toc_float: true
    number_sections: false
---

<style>
body {
text-align: justify}
</style>

<!-- Automatically computes and prints in the output the running time for any code chunk -->
```{r, echo=FALSE}
# https://github.com/rstudio/rmarkdown/issues/1453
hooks = knitr::knit_hooks$get()
hook_foldable = function(type) {
  force(type)
  function(x, options) {
    res = hooks[[type]](x, options)
    
    if (isFALSE(options[[paste0("fold_", type)]])) return(res)
    
    paste0(
      "<details><summary>", "show", "</summary>\n\n",
      res,
      "\n\n</details>"
    )
  }
}
knitr::knit_hooks$set(
  output = hook_foldable("output"),
  plot = hook_foldable("plot"),
  time_it = local({
    now = NULL
    function(before, options) {
      if (options$time_it) {
        if (before) {
          now <<- Sys.time()
        } else {
          res = difftime(Sys.time(), now, units = "secs")
          paste("(Time to run :", round(res, digits = 2), "s)")
        }
      }
    }
  })
)
```

<!-- Set default parameters for all chunks -->
```{r, setup, include = FALSE}
set.seed(1337L)
knitr::opts_chunk$set(echo = TRUE, # display code
                      # display chunk output
                      message = FALSE,
                      warning = FALSE,
                      fold_output = FALSE, # usefull for sessionInfo()
                      fold_plot = FALSE,
                      
                      # figure settings
                      fig.align = 'center',
                      fig.width = 20,
                      fig.height = 15,
                      
                      # something about seed, chunk and Rmarkdown compilation
                      # https://stackoverflow.com/questions/39417003/long-vectors-not-supported-yet-error-in-rmd-but-not-in-r-script
                      # cache = TRUE,
                      cache.lazy = FALSE, 
                      
                      # add runtime after chunk
                      time_it = FALSE)
```


The goal of this script is to generate a Seurat object for sample `r params$sample_name`.

* removal of cells based on quality control metrics
* normalization with `LogNormalize`, then doublets detection using `scran hybrid` and `scDblFinder` method, and doublet cells removal
* normalization with `LogNormalize`, for only the remaining cells
* cell cycle and cell type annotation
* dimensionality reduction using `PCA`
* projection using `tSNE` and `UMAP`


```{r library}
library(dplyr)
library(patchwork)
library(ggplot2)

.libPaths()
```

# Preparation

In this section, we set the global settings of the analysis. We will store data there :

```{r out_dir}
out_dir = "."
```

We load the parameters :

```{r get_param}
sample_name = params$sample_name # "F31W"
# sample_name = "F31W"
```

Input count matrix is there :

```{r count_matrix_dir}
count_matrix_dir = paste0(out_dir, "/input/", sample_name, "/raw_feature_bc_matrix")
```


We load the markers and specific colors for each cell type :

```{r cell_markers}
cell_markers = readRDS(paste0(out_dir, "/../1_metadata/wu_cell_markers.rds"))
cell_markers = lapply(cell_markers, FUN = toupper)
lengths(cell_markers)
```

Here are custom colors for each cell type :

```{r color_markers, fig.width = 12, fig.height = 1, class.source = "fold-hide"}
color_markers = readRDS(paste0(out_dir, "/../1_metadata/wu_color_markers.rds"))

data.frame(cell_type = names(color_markers),
           color = unlist(color_markers)) %>%
  ggplot2::ggplot(., aes(x = cell_type, y = 0, fill = cell_type)) +
  ggplot2::geom_point(pch = 21, size = 5) +
  ggplot2::scale_fill_manual(values = unlist(color_markers), breaks = names(color_markers)) +
  ggplot2::theme_classic() +
  ggplot2::theme(legend.position = "none",
                 axis.line = element_blank(),
                 axis.title = element_blank(),
                 axis.ticks = element_blank(),
                 axis.text.y = element_blank())
```


We load markers to display on the dotplot :

```{r dotplot_markers}
dotplot_markers = readRDS(paste0(out_dir, "/../1_metadata/wu_dotplot_markers.rds"))
dotplot_markers = lapply(dotplot_markers, FUN = toupper)
dotplot_markers
```


We load metadata for this sample :

```{r sample_info}
sample_info = readRDS(paste0(out_dir, "/../1_metadata/wu_sample_info.rds"))
sample_info %>%
  dplyr::filter(project_name == sample_name)
```


These is a parameter for different functions :

```{r global_settings}
cl = aquarius::create_parallel_instance(nthreads = 3L)
cut_log_nCount_RNA = 6
cut_nFeature_RNA = 500
cut_percent.mt = 20
cut_percent.rb = 50
```

# Load count matrix

In this section, we load the raw count matrix. Then, we applied an empty droplets filtering.

```{r load_count_matrix, time_it = TRUE}
sobj = aquarius::load_sc_data(data_path = count_matrix_dir,
                              sample_name = sample_name,
                              my_seed = 1337L)
sobj
```

We set an upper bound to 20,000 cells. Remaining cells correspond to ambient RNA.

```{r top_2000}
if (ncol(sobj) > 20000) {
  sobj$nb_UMI = Seurat::GetAssayData(sobj, assay = "RNA", slot = "counts") %>%
    Matrix::colSums()
  
  thresh_20k = sort(sobj$nb_UMI, decreasing = TRUE)[20000]
  
  sobj = subset(sobj, nb_UMI >= thresh_20k)
  sobj$nb_UMI = NULL
  
  sobj
}
```


In genes metadata, we add the Ensembl ID. The `sobj@assays$RNA@meta.features` dataframe contains three information :

* `rownames` : gene names stored as the dimnames of the count matrix. Duplicated gene names will have a `.1` at the end of their name
* `Ensembl_ID` : EnsemblID, as stored in the `features.tsv.gz` file
* `gene_name` : gene_name, as stored in the `features.tsv.gz` file. Duplicated gene names will have the same name.

```{r features_df}
features_df = read.csv(paste0(count_matrix_dir, "/features.tsv.gz"), sep = "\t", header = 0)
features_df = features_df[, c(1:2)]
colnames(features_df) = c("Ensembl_ID", "gene_name")
rownames(features_df) = rownames(sobj) # mandatory for Seurat::FindVariableFeatures

sobj@assays$RNA@meta.features = features_df
rm(features_df)

head(sobj@assays$RNA@meta.features)
```

We add the same columns as in metadata :

```{r add_metadata}
row_oi = (sample_info$project_name == sample_name)

sobj$project_name = sample_name
sobj$sample_identifier = sample_info[row_oi, "sample_identifier"]
sobj$sample_type = sample_info[row_oi, "sample_type"]
sobj$location = sample_info[row_oi, "location"]
sobj$laboratory = sample_info[row_oi, "laboratory"]

colnames(sobj@meta.data)
```

# Before filtering

## Normalization

```{r normalization}
sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
```

## Projection

We generate a tSNE to visualize cells before filtering.

```{r pca_before, fig.width = 12, fig.height = 4}
sobj = aquarius::dimensions_reduction(sobj = sobj,
                                      assay = "RNA",
                                      reduction = "pca",
                                      max_dims = 100,
                                      verbose = FALSE)
Seurat::ElbowPlot(sobj, ndims = 100, reduction = "RNA_pca")
```


We generate a tSNE with 20 principal components :

```{r tsne_before}
ndims = 20
sobj = Seurat::RunTSNE(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_tsne"))

sobj
```

## Cell type

We annotate cells for cell type using `Seurat::AddModuleScore` function.

```{r cell_annot_custom_short, time_it = TRUE}
sobj = aquarius::cell_annot_custom(sobj,
                                   newname = "cell_type",
                                   markers = cell_markers,
                                   use_negative = TRUE,
                                   add_score = TRUE,
                                   verbose = TRUE)

colnames(sobj@meta.data) = stringr::str_replace_all(string = colnames(sobj@meta.data),
                                                    pattern = " ",
                                                    replacement = "_")

sobj$cell_type = factor(sobj$cell_type, levels = names(cell_markers))

table(sobj$cell_type)
```

To justify cell type annotation, we can make a dotplot :

```{r dotplot_cell_type_short, fig.width = 12, fig.height = 9}
markers = c("PTPRC", "MSX2", "KRT16",
            unique(unlist(dotplot_markers[levels(sobj$cell_type)])))
markers = markers[markers %in% rownames(sobj)]

aquarius::plot_dotplot(sobj, assay = "RNA",
                       column_name = "cell_type",
                       markers = markers,
                       nb_hline = 0) +
  ggplot2::scale_color_gradientn(colors = aquarius:::color_gene) +
  ggplot2::theme(legend.position = "right",
                 legend.box = "vertical",
                 legend.direction = "vertical",
                 axis.title = element_blank(),
                 axis.text = element_text(size = 15))
```

We can make a barplot to see the composition of each dataset, and visualize cell types on the projection.

```{r barplot_celltype, fig.height = 6, fig.width = 8, class.source = "fold-hide"}
df_proportion = as.data.frame(prop.table(table(sobj$orig.ident,
                                               sobj$cell_type)))
colnames(df_proportion) = c("orig.ident", "cell_type", "freq")

quantif = table(sobj$orig.ident) %>%
  as.data.frame.table() %>%
  `colnames<-`(c("orig.ident", "nb_cells"))

# Plot
plot_list = list()

plot_list[[2]] = aquarius::plot_barplot(df = df_proportion,
                                        x = "orig.ident",
                                        y = "freq",
                                        fill = "cell_type",
                                        position = ggplot2::position_fill()) +
  ggplot2::scale_fill_manual(name = "Cell type",
                             values = color_markers[levels(df_proportion$cell_type)],
                             breaks = levels(df_proportion$cell_type)) +
  ggplot2::geom_label(data = quantif, inherit.aes = FALSE,
                      aes(x = orig.ident, y = 1.05, label = nb_cells),
                      label.size = 0)

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cell_type") +
  ggplot2::scale_color_manual(values = unlist(color_markers),
                              breaks = names(color_markers)) +
  ggplot2::labs(title = sample_name,
                subtitle = paste0(ncol(sobj), " cells")) +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1, widths = c(6, 1))
```

## Cell cycle phase

We annotate cells for cell cycle phase using `Seurat` and `cyclone`.

```{r cell_cycle, time_it = TRUE}
cc_columns = aquarius::add_cell_cycle(sobj = sobj,
                                      assay = "RNA",
                                      species_rdx = "hs",
                                      BPPARAM = cl)@meta.data[, c("Seurat.Phase", "Phase")]

sobj$Seurat.Phase = cc_columns$Seurat.Phase
sobj$cyclone.Phase = cc_columns$Phase

table(sobj$Seurat.Phase, sobj$cyclone.Phase)
```

We visualize cell cycle on the projection :

```{r see_cell_cycle1, fig.width = 12, fig.height = 7, class.source = "fold-hide"}
plot_list = list()

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "Seurat.Phase") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "Seurat.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cyclone.Phase") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "cyclone.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1)
```

# Quality control

In this section, we look at the number of genes expressed by each cell, the number of UMI, the percentage of mitochondrial genes expressed, and the percentage of ribosomal genes expressed. Then, without taking into account the cells expressing low number of genes or have low number of UMI, we identify doublet cells.

We compute four quality metrics :

```{r qc_metrics}
sobj = Seurat::PercentageFeatureSet(sobj, pattern = "^MT", col.name = "percent.mt")
sobj = Seurat::PercentageFeatureSet(sobj, pattern = "^RP[L|S][0-9]*$", col.name = "percent.rb")

head(sobj@meta.data)
```

We get the cell barcodes for the failing cells :

```{r failed}
fail_percent.mt = sobj@meta.data %>% dplyr::filter(percent.mt > cut_percent.mt) %>% rownames()
fail_percent.rb = sobj@meta.data %>% dplyr::filter(percent.rb > cut_percent.rb) %>% rownames()
fail_log_nCount_RNA = sobj@meta.data %>% dplyr::filter(log_nCount_RNA < cut_log_nCount_RNA) %>% rownames()
fail_nFeature_RNA = sobj@meta.data %>% dplyr::filter(nFeature_RNA < cut_nFeature_RNA) %>% rownames()
```


## Doublet cells detection

Without taking into account the low UMI and low number of features cells, we identify doublets.

```{r fsobj}
fsobj = subset(sobj, invert = TRUE,
               cells = unique(c(fail_log_nCount_RNA, fail_nFeature_RNA)))
fsobj
```

On this filtered dataset, we apply doublet cells detection. Just before, we run the normalization, taking into account only the remaining cells.

```{r normalization_doublets}
fsobj = Seurat::NormalizeData(fsobj,
                              normalization.method = "LogNormalize",
                              assay = "RNA")

fsobj = Seurat::FindVariableFeatures(fsobj,
                                     assay = "RNA",
                                     nfeatures = 3000)
fsobj
```

We identify doublet cells :

```{r doublet_detection, time_it = TRUE}
fsobj = aquarius::find_doublets(sobj = fsobj,
                                BPPARAM = cl)

fail_doublets_consensus = Seurat::WhichCells(fsobj, expression = doublets_consensus.class)
fail_doublets_scDblFinder = Seurat::WhichCells(fsobj, expression = scDblFinder.class)
fail_doublets_hybrid = Seurat::WhichCells(fsobj, expression = hybrid_score.class)
```

We add the information in the non filtered Seurat object :

```{r add_doublet_metrics}
sobj$doublets_consensus.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                                 colnames(sobj) %in% fail_doublets_consensus ~ TRUE,
                                                 !(colnames(sobj) %in% fail_doublets_consensus) ~ FALSE)

sobj$scDblFinder.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                          colnames(sobj) %in% fail_doublets_scDblFinder ~ TRUE,
                                          !(colnames(sobj) %in% fail_doublets_scDblFinder) ~ FALSE)

sobj$hybrid_score.class = dplyr::case_when(!(colnames(sobj) %in% colnames(fsobj)) ~ NA,
                                           colnames(sobj) %in% fail_doublets_hybrid ~ TRUE,
                                           !(colnames(sobj) %in% fail_doublets_hybrid) ~ FALSE)
```

```{r clean_fsobj, echo = FALSE}
rm(fsobj)
```


## Quality control representation

We can visualize the 4 cells quality with a Venn diagram : 

```{r qc_venn, class.source = "fold-hide", fig.width = 8, fig.height = 6}
n_filtered = c(fail_percent.mt, fail_percent.rb, fail_log_nCount_RNA, fail_nFeature_RNA) %>%
  unique() %>% length()
percent_filtered = round(100*(n_filtered/ncol(sobj)), 2)

ggvenn::ggvenn(list(percent.mt = fail_percent.mt,
                    percent.rb = fail_percent.rb,
                    log_nCount_RNA = fail_log_nCount_RNA,
                    nFeature_RNA = fail_nFeature_RNA), 
               fill_color = c("#0073C2FF", "#EFC000FF", "orange", "pink"),
               stroke_size = 0.5, set_name_size = 4) +
  ggplot2::labs(title = "Filtered out cells",
                subtitle = paste0(n_filtered, " cells (", percent_filtered, " % of all cells)")) +
  ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                 plot.subtitle = element_text(hjust = 0.5))
```


### Number of UMI

To visualize the threshold for number of UMI, we can make a histogram :

```{r qc_umi_hist, class.source = "fold-hide", fig.width = 10, fig.height = 5}
aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "log_nCount_RNA",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_log_nCount_RNA)
```

```{r vln_umi_cell_type, fig.width = 12, fig.height = 6}
Seurat::VlnPlot(sobj, features = "log_nCount_RNA", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_log_nCount_RNA, col = "red") +
  ggplot2::labs(x = "")
```

```{r qc_umi_proj, fig.width = 8, fig.height = 7}
sobj$fail = ifelse(colnames(sobj) %in% fail_log_nCount_RNA,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "log_nCount_RNA",
                subtitle = paste0(length(fail_log_nCount_RNA), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))
```

### Number of features

To visualize the threshold for number of features, we can make a histogram :

```{r qc_features_hist, class.source = "fold-hide", fig.width = 10, fig.height = 5}
aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "nFeature_RNA",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_nFeature_RNA)
```


```{r vln_features_cell_type, fig.width = 12, fig.height = 6}
Seurat::VlnPlot(sobj, features = "nFeature_RNA", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_nFeature_RNA, col = "red") +
  ggplot2::labs(x = "")
```

```{r qc_features_proj, fig.width = 8, fig.height = 7}
sobj$fail = ifelse(colnames(sobj) %in% fail_nFeature_RNA,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "nFeature_RNA",
                subtitle = paste0(length(fail_nFeature_RNA), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))
```

### Mitochondrial genes expression

To identify a threshold for mitochondrial gene expression, we can make a histogram :

```{r qc_mito_hist, class.source = "fold-hide", fig.width = 10, fig.height = 5}
aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "percent.mt",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_percent.mt)
```

```{r vln_percentmt_cell_type, fig.width = 12, fig.height = 6}
Seurat::VlnPlot(sobj, features = "percent.mt", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_percent.mt, col = "red") +
  ggplot2::labs(x = "")
```

```{r qc_mito_proj, fig.width = 8, fig.height = 7}
sobj$fail = ifelse(colnames(sobj) %in% fail_percent.mt,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "percent.mt",
                subtitle = paste0(length(fail_percent.mt), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))
```

### Ribosomal genes expression

To identify a threshold for ribosomal gene expression, we can make a histogram :

```{r qc_ribo_hist, class.source = "fold-hide", fig.width = 10, fig.height = 5}
aquarius::plot_qc_density(df = sobj@meta.data,
                          x = "percent.rb",
                          bins = 200,
                          group_by = "orig.ident",
                          group_color = setNames(sample_info$color,
                                                 nm = sample_info$sample_identifiant),
                          x_thresh = cut_percent.rb)
```


```{r vln_percentrb_cell_type, fig.width = 12, fig.height = 6}
Seurat::VlnPlot(sobj, features = "percent.rb", pt.size = 0.001,
                group.by = "cell_type", cols = color_markers) +
  ggplot2::scale_fill_manual(values = color_markers, breaks = names(color_markers)) +
  ggplot2::geom_hline(yintercept = cut_percent.rb, col = "red") +
  ggplot2::labs(x = "")
```

```{r qc_ribo_proj, fig.width = 8, fig.height = 7}
sobj$fail = ifelse(colnames(sobj) %in% fail_percent.rb,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

Seurat::DimPlot(sobj, group.by = "fail", na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "percent.rb",
                subtitle = paste0(length(fail_percent.rb), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))
```

### FACS-like figure

We would like to see if the number of feature expressed by cell, and the number of UMI is correlated with the cell type, the percentage of mitochondrial and ribosomal gene expressed, and the doublet status. We build the `log_nCount_RNA` by `nFeature_RNA` figure, where cells (dots) are colored by these different metrics.

This is the figure, colored by cell type :

```{r qc_patchwork_cell_type, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "cell_type",
                           col_colors = unname(color_markers),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the percentage of mitochondrial genes expressed in cell :

```{r qc_patchwork_mito, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "percent.mt",
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the percentage of ribosomal genes expressed in cell :

```{r qc_patchwork_ribo, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "percent.rb",
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the doublet cells status (`doublets_consensus.class`) :

```{r qc_patchwork_doublet_consensus, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "doublets_consensus.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the doublet cells status (`scDblFinder.class`) :

```{r qc_patchwork_scdblfinder, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "scDblFinder.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```

This is the figure, colored by the doublet cells status (`hybrid_score.class`) :

```{r qc_patchwork_hybrid_score, class.source = "fold-hide", fig.width = 10, fig.height = 8}
aquarius::plot_qc_facslike(df = sobj@meta.data,
                           x = "nFeature_RNA",
                           y = "log_nCount_RNA",
                           col_by = "hybrid_score.class",
                           col_colors = setNames(nm = c(TRUE, FALSE),
                                                 aquarius::gg_color_hue(2)),
                           x_thresh = cut_nFeature_RNA,
                           y_thresh = cut_log_nCount_RNA,
                           bins = 200)
```


### Visualization as piechart

Do filtered cells belong to a particular cell type ?

```{r qc_piechart_cell_type, class.source = "fold-hide", fig.width = 12, fig.height = 9}
sobj$all_cells = TRUE

plot_list = list()

## All cells
df = sobj@meta.data
if (nrow(df) == 0) {
  plot_list[[1]] = ggplot()
} else {
  plot_list[[1]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = "All cells",
                  subtitle = paste(nrow(df), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## Doublets consensus
df = sobj@meta.data %>%
  dplyr::filter(doublets_consensus.class)
if (nrow(df) == 0) {
  plot_list[[2]] = ggplot()
} else {
  plot_list[[2]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = "doublets_consensus.class",
                  subtitle = paste(sum(sobj$doublets_consensus.class, na.rm = TRUE), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## percent.mt
df = sobj@meta.data %>%
  dplyr::filter(percent.mt > cut_percent.mt)
if (nrow(df) == 0) {
  plot_list[[3]] = ggplot()
} else {
  plot_list[[3]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("percent.mt >", cut_percent.mt),
                  subtitle = paste(length(fail_percent.mt), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## percent.rb
df = sobj@meta.data %>%
  dplyr::filter(percent.rb > cut_percent.rb)
if (nrow(df) == 0) {
  plot_list[[4]] = ggplot()
} else {
  plot_list[[4]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("percent.rb >", cut_percent.rb),
                  subtitle = paste(length(fail_percent.rb), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## log_nCount_RNA
df = sobj@meta.data %>%
  dplyr::filter(log_nCount_RNA < cut_log_nCount_RNA)
if (nrow(df) == 0) {
  plot_list[[5]] = ggplot()
} else {
  plot_list[[5]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("log_nCount_RNA <", round(cut_log_nCount_RNA, 2)),
                  subtitle = paste(length(fail_log_nCount_RNA), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

## nFeature_RNA
df = sobj@meta.data %>%
  dplyr::filter(nFeature_RNA < cut_nFeature_RNA)
if (nrow(df) == 0) {
  plot_list[[6]] = ggplot()
} else {
  plot_list[[6]] = aquarius::plot_piechart(df = df,
                                           logical_var = "all_cells",
                                           grouping_var = "cell_type",
                                           colors = color_markers,
                                           display_legend = TRUE) +
    ggplot2::labs(title = paste("nFeature_RNA <", round(cut_nFeature_RNA, 2)),
                  subtitle = paste(length(fail_nFeature_RNA), "cells")) +
    ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"),
                   plot.subtitle = element_text(hjust = 0.5))
}

patchwork::wrap_plots(plot_list, ncol = 3) +
  patchwork::plot_layout(guides = "collect") &
  ggplot2::theme(legend.position = "right")
```

```{r clean_qc_4, echo = FALSE}
sobj$all_cells = NULL

rm(plot_list, df)
```


### Doublet cells status

We can compare doublet detection methods with a Venn diagram : 

```{r qc_venn_doublet, class.source = "fold-hide", fig.width = 8, fig.height = 6}
ggvenn::ggvenn(list(hybrid = fail_doublets_hybrid,
                    scDblFinder = fail_doublets_scDblFinder), 
               fill_color = c("#0073C2FF", "#EFC000FF"),
               stroke_size = 0.5, set_name_size = 4) +
  ggplot2::ggtitle(label = "Doublet cells") +
  ggplot2::theme(plot.title = element_text(hjust = 0.5, face = "bold"))
```

We visualize cells annotation for doublets :

```{r doublets_proj, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
plot_list = list()

# scDblFinder.class
sobj$fail = ifelse(sobj$scDblFinder.class,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "fail",
                                 na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "scDblFinder.class",
                subtitle = paste0(sum(sobj$scDblFinder.class, na.rm = TRUE), " cells")) +
  Seurat::NoAxes() + Seurat::NoLegend() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

# hybrid_score.class
sobj$fail = ifelse(sobj$hybrid_score.class,
                   yes = as.character(sobj$cell_type), no = NA)
sobj$fail = factor(sobj$fail, levels = c(levels(sobj$cell_type), NA))

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "fail",
                                 na.value = "gray80", cols = color_markers) +
  ggplot2::labs(title = "hybrid_score.class",
                subtitle = paste0(sum(sobj$hybrid_score.class, na.rm = TRUE), " cells")) +
  Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

sobj$fail = NULL

# Plot
patchwork::wrap_plots(plot_list, nrow = 1)
```

## Save

We could save this object before filtering (remove `eval = FALSE`) :

```{r save_sobj_unfiltered_annotated, eval = FALSE}
saveRDS(sobj, paste0(out_dir, "/datasets/", sample_name, "_sobj_unfiltered.rds"))
```


# Filtering

We remove :

* cells with a number of UMI lower than `r cut_log_nCount_RNA`
* cells expressing a number of genes lower than `r cut_nFeature_RNA`
* cells having more than `r cut_percent.mt` \% of UMI related to mitochondrial genes
* cells having more than `r cut_percent.rb` \% of UMI related to ribosomal genes
* doublet cells detected with both method (**scDblFinder** and **scds-hybrid**)


```{r filter_cells}
sobj = subset(sobj, invert = TRUE,
              cells = unique(c(fail_log_nCount_RNA, fail_nFeature_RNA,
                               fail_percent.mt, fail_percent.rb,
                               fail_doublets_consensus)))
sobj
```

```{r clean_filter, echo = FALSE}
rm(fail_doublets_consensus, fail_doublets_scDblFinder, fail_doublets_hybrid,
   fail_percent.mt, fail_percent.rb, fail_log_nCount_RNA, fail_nFeature_RNA,
   cut_percent.mt, cut_percent.rb, cut_log_nCount_RNA, cut_nFeature_RNA)
```


# Post-filtering processing

## Normalization

We normalize the count matrix for remaining cells :

```{r normalization_3}
sobj = Seurat::NormalizeData(sobj,
                             normalization.method = "LogNormalize",
                             assay = "RNA")

sobj = Seurat::FindVariableFeatures(sobj,
                                    assay = "RNA",
                                    nfeatures = 3000)
sobj
```

## Projection

We perform a PCA :

```{r pca, fig.width = 12, fig.height = 4}
sobj = aquarius::dimensions_reduction(sobj = sobj,
                                      assay = "RNA",
                                      reduction = "pca",
                                      max_dims = 100,
                                      verbose = FALSE)
Seurat::ElbowPlot(sobj, ndims = 100, reduction = "RNA_pca")
```


We generate a tSNE and a UMAP with 20 principal components :

```{r tsne_umap}
ndims = 20
sobj = Seurat::RunTSNE(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_tsne"))

sobj = Seurat::RunUMAP(sobj,
                       reduction = "RNA_pca",
                       dims = 1:ndims,
                       seed.use = 1337L,
                       reduction.name = paste0("RNA_pca_", ndims, "_umap"))
```


## Annotation

We annotate cells for cell type, with the new normalized expression matrix :

```{r cell_type_2, time_it = TRUE}
score_columns = grep(x = colnames(sobj@meta.data), pattern = "^score", value = TRUE)
sobj@meta.data[, score_columns] = NULL
sobj$cell_type = NULL

sobj = aquarius::cell_annot_custom(sobj,
                                   newname = "cell_type",
                                   markers = cell_markers,
                                   use_negative = TRUE,
                                   add_score = TRUE,
                                   verbose = TRUE)

sobj$cell_type = factor(sobj$cell_type, levels = names(cell_markers))

colnames(sobj@meta.data) = stringr::str_replace_all(string = colnames(sobj@meta.data),
                                                    pattern = " ",
                                                    replacement = "_")

table(sobj$cell_type)
```


To justify cell type annotation, we can make a dotplot :

```{r dotplot_cell_type_short2, fig.width = 12, fig.height = 9}
markers = c("PTPRC", unique(unlist(dotplot_markers[levels(sobj$cell_type)])))
markers = markers[markers %in% rownames(sobj)]

aquarius::plot_dotplot(sobj, assay = "RNA",
                       column_name = "cell_type",
                       markers = markers,
                       nb_hline = 0) +
  ggplot2::scale_color_gradientn(colors = aquarius:::color_gene) +
  ggplot2::theme(legend.position = "right",
                 legend.box = "vertical",
                 legend.direction = "vertical",
                 axis.title = element_blank(),
                 axis.text = element_text(size = 15))
```

We can make a barplot to see the composition of each dataset, and visualize cell types on the projection.

```{r barplot_celltype2, fig.height = 6, fig.width = 8, class.source = "fold-hide"}
df_proportion = as.data.frame(prop.table(table(sobj$orig.ident,
                                               sobj$cell_type)))
colnames(df_proportion) = c("orig.ident", "cell_type", "freq")

quantif = table(sobj$orig.ident) %>%
  as.data.frame.table() %>%
  `colnames<-`(c("orig.ident", "nb_cells"))

# Plot
plot_list = list()

plot_list[[2]] = aquarius::plot_barplot(df = df_proportion,
                                        x = "orig.ident",
                                        y = "freq",
                                        fill = "cell_type",
                                        position = ggplot2::position_fill()) +
  ggplot2::scale_fill_manual(name = "Cell type",
                             values = color_markers[levels(df_proportion$cell_type)],
                             breaks = levels(df_proportion$cell_type)) +
  ggplot2::geom_label(data = quantif, inherit.aes = FALSE,
                      aes(x = orig.ident, y = 1.05, label = nb_cells),
                      label.size = 0)

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cell_type",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::scale_color_manual(values = unlist(color_markers),
                              breaks = names(color_markers)) +
  ggplot2::labs(title = sample_name,
                subtitle = paste0(ncol(sobj), " cells")) +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1, widths = c(6, 1))
```


### Cell cycle

We annotate cells for cell cycle phase :

```{r cell_cycle2, time_it = TRUE}
cc_columns = aquarius::add_cell_cycle(sobj = sobj,
                                      assay = "RNA",
                                      species_rdx = "hs",
                                      BPPARAM = cl)@meta.data[, c("Seurat.Phase", "Phase")]

sobj$Seurat.Phase = cc_columns$Seurat.Phase
sobj$cyclone.Phase = cc_columns$Phase

table(sobj$Seurat.Phase, sobj$cyclone.Phase)
```


We visualize cell cycle on the projection :

```{r see_cell_cycle2, fig.width = 12, fig.height = 7, class.source = "fold-hide"}
plot_list = list()

plot_list[[2]] = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "Seurat.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

plot_list[[1]] = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                                 reduction = "RNA_pca_20_tsne") +
  ggplot2::labs(title = "Cell Cycle Phase",
                subtitle = "cyclone.Phase") +
  Seurat::NoLegend() + Seurat::NoAxes() +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5))

patchwork::wrap_plots(plot_list, nrow = 1)
```


## Clustering

We make a highly resolutive clustering :

```{r clustering}
sobj = Seurat::FindNeighbors(sobj, reduction = "RNA_pca", dims = c(1:ndims))
sobj = Seurat::FindClusters(sobj, resolution = 2)

table(sobj$seurat_clusters)
```


# Visualization

## Cell type

We can visualize the cell type :

```{r see_cell_type, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
tsne = Seurat::DimPlot(sobj, group.by = "cell_type",
                       reduction = paste0("RNA_pca_", ndims, "_tsne"), cols = color_markers) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "cell_type",
                       reduction = paste0("RNA_pca_", ndims, "_umap"), cols = color_markers) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap
```

## Cell cycle

We can visualize the cell cycle, from Seurat :

```{r see_cc_Seurat, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
tsne = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "Seurat.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap
```


We can visualize the cell cycle, from cyclone :

```{r see_cc_cyclone, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
tsne = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "cyclone.Phase",
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap
```


## Clusters

We visualize the clustering :

```{r see_clusters, fig.width = 12, fig.height = 6, class.source = "fold-hide"}
tsne = Seurat::DimPlot(sobj, group.by = "seurat_clusters", label = TRUE,
                       reduction = paste0("RNA_pca_", ndims, "_tsne")) +
  Seurat::NoAxes() + ggplot2::ggtitle("tSNE") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5),
                 legend.position = "none")

umap = Seurat::DimPlot(sobj, group.by = "seurat_clusters", label = TRUE,
                       reduction = paste0("RNA_pca_", ndims, "_umap")) +
  Seurat::NoAxes() + ggplot2::ggtitle("UMAP") +
  ggplot2::theme(aspect.ratio = 1,
                 plot.title = element_text(hjust = 0.5))

tsne | umap
```


## Gene expression

We visualize all cell types markers on the tSNE :

```{r plot_genes, fig.width = 12, fig.height = 20}
markers = dotplot_markers %>% unlist() %>% unname()
markers = markers[markers %in% rownames(sobj)]

plot_list = lapply(markers,
                   FUN = function(one_gene) {
                     p = Seurat::FeaturePlot(sobj, features = one_gene,
                                             reduction = paste0("RNA_pca_", ndims, "_tsne")) +
                       ggplot2::labs(title = one_gene) +
                       ggplot2::scale_color_gradientn(colors = aquarius::color_gene) +
                       ggplot2::theme(aspect.ratio = 1,
                                      plot.subtitle = element_text(hjust = 0.5)) +
                       Seurat::NoAxes()
                     return(p)
                   })

patchwork::wrap_plots(plot_list, ncol = 4)
```


# Save

We save the annotated and filtered Seurat object :

```{r save_sobj_filtered_annotated}
saveRDS(sobj, file = paste0(out_dir, "/datasets/", sample_name, "_sobj_filtered.rds"))
```


# R session

```{r sessioninfo, echo = FALSE, fold_output = TRUE}
sessionInfo()
```
